#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)
suppressWarnings(suppressMessages(library(WGCNA)))

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 20_02_21_data_preprocessing.Rmd) and clustering (pipeline in 20_02_24_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)

genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))


clustering_selected = 'DynamicHybridMergedSmall'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds)


Relation to external clinical traits


Quantifying module-trait associations


Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

datTraits = datMeta %>% dplyr::select(Diagnosis, Sex, Age, PMI, RNAExtractionBatch) %>%
            dplyr::rename('ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "2 correlation(s) could not be calculated"
rm(ME_object)

Note: The correlation between Module #E08B00 and Diagonsis is the one that cannot be calculated, weirdly enough, the thing that causes the error is that the initial correlation is too high, so it would be a very bad thing to lose this module because of this numerical error. I’m going to fill in its value using the polyserial function, which doesn’t give exactly the same results as the hetcor() function, but it’s quite similar.

# Calculate the correlation tha failed with hetcor()
moduleTraitCor['ME#E08B00','Diagnosis'] = polyserial(MEs[,'ME#E08B00'], datTraits$Diagnosis)
## Warning in polyserial(MEs[, "ME#E08B00"], datTraits$Diagnosis): initial
## correlation inadmissible, 1.07478833176392, set to 0.9999

I’m going to select all the modules that have an absolute correlation higher than 0.9 with Diagnosis to study them

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')

rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)


Studying the modules with the highest absolute correlation to Diagnosis


The modules consist mainly of points with very high (absolute) values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules

top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.9])

cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #E08B00, #7DAE00, #FE6E8A
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
            dplyr::select(ID, external_gene_id, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', external_gene_id, ')'))

table(plot_data$ImportantModules)
## 
## #7DAE00 #E08B00 #FE6E8A  Others 
##     149     223      32   15750
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)




Module Membership vs Gene Significance


create_plot = function(module){
  
  plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
  colnames(plot_data)[2] = 'Module'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
  
  p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('Gene Significance') +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() + xlab('Module Membership') +
               ggtitle(paste0('Module ', module,'  (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)




SFARI Genes


List of SFARI Genes in top modules ordered by SFARI score and Gene Significance

table_data = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
             dplyr::select(ID, external_gene_id, GS, gene.score, Module) %>% arrange(gene.score, desc(abs(GS))) %>%
             dplyr::rename('Ensembl ID'=ID, 'Gene Symbol'=external_gene_id, 
                    'SFARI score'=gene.score, 'Gene Significance'=GS)


kable(table_data %>% filter(Module == top_modules[1] & `SFARI score` != 'None') %>% dplyr::select(-Module),
      caption=paste0('SFARI Genes for Module ', top_modules[1]))
SFARI Genes for Module #E08B00
Ensembl ID Gene Symbol Gene Significance SFARI score
ENSG00000153827 TRIP12 0.6566880 1
ENSG00000135387 CAPRIN1 0.7630901 3
ENSG00000183454 GRIN2A 0.6185342 3
ENSG00000112902 SEMA5A 0.5854825 3
ENSG00000197635 DPP4 0.8471604 4
ENSG00000133216 EPHB2 0.8375782 4
ENSG00000011422 PLAUR 0.7699256 4
ENSG00000108055 SMC3 0.7376463 4
ENSG00000156298 TSPAN7 0.4841070 4
ENSG00000138095 LRPPRC 0.8541114 5
ENSG00000254093 PINX1 0.8284784 5
ENSG00000078725 BRINP1 0.7797796 5
ENSG00000122877 EGR2 0.7697789 5
ENSG00000012963 UBR7 0.6308559 5
ENSG00000179222 MAGED1 0.5233699 5
kable(table_data %>% filter(Module == top_modules[2] & `SFARI score` != 'None') %>% dplyr::select(-Module),
      caption=paste0('SFARI Genes for Module ', top_modules[2]))
SFARI Genes for Module #7DAE00
Ensembl ID Gene Symbol Gene Significance SFARI score
ENSG00000181804 SLC9A9 0.9338530 4
ENSG00000014138 POLA2 0.7761639 4
ENSG00000128594 LRRC4 0.4352575 4
ENSG00000151148 UBE3B 0.5519116 5
ENSG00000169359 SLC33A1 0.3298546 5
kable(table_data %>% filter(Module == top_modules[3] & `SFARI score` != 'None') %>% dplyr::select(-Module),
      caption=paste0('SFARI Genes for Module ', top_modules[3]))
SFARI Genes for Module #FE6E8A
Ensembl ID Gene Symbol Gene Significance SFARI score

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but this doesn’t seem to be the case

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) + geom_hline(yintercept=mean(plot_data$p), color='gray') +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row, plot_data)




Module Eigengenes


Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control.

In all cases, the Eigengenes separate the behaviour between autism and control samples very clearly!

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME',module)], 'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + geom_boxplot() + theme_minimal() + theme(legend.position='none') +
                    ggtitle(paste0('Module ', module, '  (MTcor=',round(moduleTraitCor[paste0('ME',module),1],2),')'))
  return(p)
}

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, p1, p2)




Identifying important genes


Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules and not a single one belonging to scores 1 and 2

create_table = function(module){
  top_genes = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>% 
              dplyr::select(ID, external_gene_id, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
              filter(dataset$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>% top_n(20)
  return(top_genes)
}

top_genes_1 = create_table(top_modules[1])
kable(top_genes_1, caption=paste0('Top 10 genes for module ', top_modules[1], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
Top 10 genes for module #E08B00 (MTcor = 1)
ID external_gene_id MM GS gene.score importance
ENSG00000124151 NCOA3 0.9176374 0.9999000 None 0.9587687
ENSG00000137642 SORL1 0.9055291 0.9939057 None 0.9497174
ENSG00000079156 OSBPL6 0.8472526 0.9504339 None 0.8988433
ENSG00000161326 DUSP14 0.9015386 0.8873720 None 0.8944553
ENSG00000198576 ARC 0.8706669 0.9058439 None 0.8882554
ENSG00000166016 ABTB2 0.8894032 0.8731209 None 0.8812621
ENSG00000146263 MMS22L 0.7617517 0.9999000 None 0.8808258
ENSG00000081923 ATP8B1 0.8139837 0.9460922 None 0.8800379
ENSG00000172071 EIF2AK3 0.8679807 0.8898247 None 0.8789027
ENSG00000109971 HSPA8 0.9033004 0.8424429 None 0.8728717
ENSG00000106069 CHN2 0.8302260 0.9149886 None 0.8726073
ENSG00000133216 EPHB2 0.8982840 0.8375782 4 0.8679311
ENSG00000122035 RASL11A 0.8087259 0.9191441 None 0.8639350
ENSG00000188306 LRRIQ4 0.7614026 0.9637941 None 0.8625984
ENSG00000120738 EGR1 0.8579800 0.8565483 None 0.8572642
ENSG00000125740 FOSB 0.8131559 0.9002224 None 0.8566892
ENSG00000050438 SLC4A8 0.8304395 0.8810662 None 0.8557529
ENSG00000135537 LACE1 0.8932288 0.8105469 None 0.8518878
ENSG00000108231 LGI1 0.8507591 0.8509201 None 0.8508396
ENSG00000138792 ENPEP 0.8273163 0.8723425 None 0.8498294
top_genes_2 = create_table(top_modules[2])
kable(top_genes_2, caption=paste0('Top 10 genes for module ', top_modules[2], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
Top 10 genes for module #7DAE00 (MTcor = 0.91)
ID external_gene_id MM GS gene.score importance
ENSG00000164168 TMEM184C 0.8879874 0.9237630 None 0.9058752
ENSG00000116857 TMEM9 0.9280139 0.8731886 None 0.9006012
ENSG00000171135 JAGN1 0.8007814 0.9999000 None 0.9003407
ENSG00000141524 TMC6 0.7743347 0.9582086 None 0.8662716
ENSG00000137872 SEMA6D 0.8004119 0.9293088 None 0.8648604
ENSG00000214534 ZNF705E 0.7526753 0.9394173 None 0.8460463
ENSG00000167797 CDK2AP2 0.8649232 0.8228135 None 0.8438684
ENSG00000204604 ZNF468 0.8488296 0.8374370 None 0.8431333
ENSG00000143653 SCCPDH 0.8068247 0.8750338 None 0.8409292
ENSG00000181804 SLC9A9 0.7244863 0.9338530 4 0.8291697
ENSG00000163155 LYSMD1 0.8369816 0.7986530 None 0.8178173
ENSG00000014138 POLA2 0.8463754 0.7761639 4 0.8112697
ENSG00000168917 SLC35G2 0.7991949 0.7969682 None 0.7980815
ENSG00000137809 ITGA11 0.7219140 0.8648469 None 0.7933804
ENSG00000249709 ZNF564 0.7461865 0.8162660 None 0.7812262
ENSG00000111481 COPZ1 0.7158520 0.8392657 None 0.7775589
ENSG00000106397 PLOD3 0.7313900 0.8214711 None 0.7764306
ENSG00000184986 TMEM121 0.8580479 0.6924940 None 0.7752709
ENSG00000140279 DUOX2 0.7465182 0.7988820 None 0.7727001
ENSG00000175426 PCSK1 0.6929284 0.8359477 None 0.7644381
top_genes_2 = create_table(top_modules[3])
kable(top_genes_2, caption=paste0('Top 10 genes for module ', top_modules[3], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[3]),1],2),')'))
Top 10 genes for module #FE6E8A (MTcor = -0.95)
ID external_gene_id MM GS gene.score importance
ENSG00000160345 C9orf116 0.9195470 -0.9108297 None 0.9151883
ENSG00000178386 ZNF223 0.8202660 -0.9953870 None 0.9078265
ENSG00000186439 TRDN 0.8967728 -0.8866637 None 0.8917183
ENSG00000188312 CENPP 0.7657596 -0.9999000 None 0.8828298
ENSG00000137944 CCBL2 0.9051492 -0.8586605 None 0.8819049
ENSG00000114378 HYAL1 0.7642047 -0.9661230 None 0.8651638
ENSG00000146707 POMZP3 0.8031535 -0.9100025 None 0.8565780
ENSG00000163606 CD200R1 0.8933894 -0.7933818 None 0.8433856
ENSG00000196812 ZSCAN16 0.7437446 -0.9368648 None 0.8403047
ENSG00000101898 RP3-324O17.4 0.8346706 -0.8343198 None 0.8344952
ENSG00000105173 CCNE1 0.7561475 -0.9077603 None 0.8319539
ENSG00000101639 CEP192 0.7332188 -0.9181599 None 0.8256893
ENSG00000101213 PTK6 0.7500411 -0.8513621 None 0.8007016
ENSG00000137463 MGARP 0.8101799 -0.7691698 None 0.7896748
ENSG00000166347 CYB5A 0.7862803 -0.7866927 None 0.7864865
ENSG00000181929 PRKAG1 0.7813053 -0.7820870 None 0.7816962
ENSG00000159618 GPR114 0.7008648 -0.8202101 None 0.7605374
ENSG00000183281 PLGLB1 0.7033820 -0.7168923 None 0.7101371
ENSG00000239887 C1orf226 0.6861897 -0.7199592 None 0.7030744
ENSG00000144741 SLC25A26 0.7355508 -0.6582607 None 0.6969058
rm(create_table)
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
            mutate(alpha = ifelse(color %in% top_modules & 
                                  ID %in% c(as.character(top_genes_1$ID), 
                                            as.character(top_genes_2$ID)), 1, 0.05))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')




Enrichment Analysis


Using the package anRichment

# Prepare dataset

# Create dataset with top modules membership and removing the genes without an assigned module
EA_dataset = data.frame('ensembl_gene_id' = genes_info$ID,
                        module = ifelse(genes_info$Module %in% top_modules, genes_info$Module, 'other')) %>%
             filter(genes_info$Module!='gray')

# Assign Entrez Gene Id to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=EA_dataset$ensembl_gene_id, mart=mart)
## Cache found
EA_dataset = EA_dataset %>% left_join(biomart_output, by='ensembl_gene_id')

for(tm in top_modules){
  cat(paste0('\n',sum(EA_dataset$module==tm & is.na(EA_dataset$entrezgene)), ' genes from top module ',
             tm, ' don\'t have an Entrez Gene ID')) 
}
## 
## 5 genes from top module #E08B00 don't have an Entrez Gene ID
## 3 genes from top module #7DAE00 don't have an Entrez Gene ID
## 0 genes from top module #FE6E8A don't have an Entrez Gene ID
rm(getinfo, mart, biomart_output, tm)
# Manual: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/GeneAnnotation/Tutorials/anRichment-Tutorial1.pdf

collectGarbage()

# EA_dataset = rbind(EA_dataset[EA_dataset$module!='other',], EA_dataset[EA_dataset$module=='other',][sample(sum(EA_dataset$module=='other'), 1000),])

# Prepare datasets
GO_col = buildGOcollection(organism = 'human', verbose = 0)
## Loading required package: org.Hs.eg.db
## 
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
internal_col = internalCollection(organism = 'human')
MillerAIBS_col = MillerAIBSCollection(organism = 'human')
BrainDisease_col = BrainDiseaseCollection(organism = 'human')
combined_col = mergeCollections(GO_col, internal_col, MillerAIBS_col, BrainDisease_col)

# Print collections used
cat('Using collections: ')
## Using collections:
knownGroups(combined_col, sortBy = 'size')
##  [1] "GO"                                         
##  [2] "GO.BP"                                      
##  [3] "GO.MF"                                      
##  [4] "GO.CC"                                      
##  [5] "JA Miller at AIBS"                          
##  [6] "Chip-X enrichment analysis (ChEA)"          
##  [7] "Brain"                                      
##  [8] "JAM"                                        
##  [9] "Prenatal brain"                             
## [10] "Brain region markers"                       
## [11] "Cortex"                                     
## [12] "Brain region marker enriched gene sets"     
## [13] "WGCNA"                                      
## [14] "BrainRegionMarkers"                         
## [15] "BrainRegionMarkers.HBA"                     
## [16] "BrainRegionMarkers.HBA.localMarker(top200)" 
## [17] "Postnatal brain"                            
## [18] "ImmunePathways"                             
## [19] "Markers of cortex layers"                   
## [20] "BrainLists"                                 
## [21] "Cell type markers"                          
## [22] "Germinal brain"                             
## [23] "BrainRegionMarkers.HBA.globalMarker(top200)"
## [24] "Accelerated evolution"                      
## [25] "Postmitotic brain"                          
## [26] "BrainLists.Blalock_AD"                      
## [27] "BrainLists.DiseaseGenes"                    
## [28] "BloodAtlases"                               
## [29] "Verge Disease Genes"                        
## [30] "BloodAtlases.Whitney"                       
## [31] "BrainLists.JAXdiseaseGene"                  
## [32] "BrainLists.MO"                              
## [33] "Age-associated genes"                       
## [34] "BrainLists.Lu_Aging"                        
## [35] "Cell type marker enriched gene sets"        
## [36] "BrainLists.CA1vsCA3"                        
## [37] "BrainLists.MitochondrialType"               
## [38] "BrainLists.MO.2+_26Mar08"                   
## [39] "BrainLists.MO.Sugino"                       
## [40] "BloodAtlases.Gnatenko2"                     
## [41] "BloodAtlases.Kabanova"                      
## [42] "BrainLists.Voineagu"                        
## [43] "StemCellLists"                              
## [44] "StemCellLists.Lee"
# Perform Enrichment Analysis
enrichment = enrichmentAnalysis(classLabels = EA_dataset$module, identifiers = EA_dataset$entrezgene,
                                refCollection = combined_col, useBackground = 'given', 
                                threshold = 1e-4, thresholdType = 'Bonferroni',
                                getOverlapEntrez = FALSE, getOverlapSymbols = TRUE)
##  enrichmentAnalysis: preparing data..
##   ..working on label set 1 ..


Results


None of the top modules seem to be enriched in anything! (See Bonferroni and FDR columns)

None of the modules seem to be enriched in anything!!! (lowest p-value in the whole enrichment analysis is 2.6e-5, which becomes 1 after the Bonferroni correction)

kable(enrichment$enrichmentTable %>% filter(class==top_modules[1]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR,
                    effectiveClassSize, effectiveSetSize, nCommonGenes),
      caption = paste0('Enriched terms for module ', top_modules[1], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[1]][1],4), ')'))
Enriched terms for module #E08B00 (MTcor = 0.9999)
dataSetID shortDataSetName inGroups Bonferroni FDR effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000526 Genes bound by TFAP2A in human MCF7 from PMID 17053090 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1 1 221 1388 38
GO:0080135 regulation of cellular response to stress GO|GO.BP 1 1 221 656 22
GO:2000273 positive regulation of signaling receptor activity GO|GO.BP 1 1 221 35 5
JAM:002866 GlutamatergicNeuronsInMouseCortex_Sugino JAM|BrainLists|BrainLists.MO|BrainLists.MO.Sugino|Cell type marker enriched gene sets|Brain region marker enriched gene sets 1 1 221 367 15
GO:1900450 negative regulation of glutamate receptor signaling pathway GO|GO.BP 1 1 221 2 2
JAM:002967 Occipital Lobe_IN_Cerebral Cortex JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 1 1 221 148 9
JAMiller.AIBS.000505 Genes bound by SUZ12 in MOUSE MESC from PMID 18974828 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1 1 221 1374 35
JAMiller.AIBS.000151 Lowest in CP of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex 1 1 221 559 19
GO:0045833 negative regulation of lipid metabolic process GO|GO.BP 1 1 221 68 6
GO:0030515 snoRNA binding GO|GO.MF 1 1 221 26 4
kable(enrichment$enrichmentTable %>% filter(class==top_modules[2]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR,
                    effectiveClassSize, effectiveSetSize, nCommonGenes),
      caption = paste0('Enriched terms for module ', top_modules[2], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[2]][1],4), ')'))
Enriched terms for module #7DAE00 (MTcor = 0.9078)
dataSetID shortDataSetName inGroups Bonferroni FDR effectiveClassSize effectiveSetSize nCommonGenes
GO:0006493 protein O-linked glycosylation GO|GO.BP 1 1 144 87 6
GO:0017176 phosphatidylinositol N-acetylglucosaminyltransferase activity GO|GO.MF 1 1 144 6 2
GO:1902231 positive regulation of intrinsic apoptotic signaling pathway in response to DNA damage GO|GO.BP 1 1 144 7 2
JAMiller.AIBS.000002 SZo markers at 21 post-conception weeks JA Miller at AIBS|Brain|Prenatal brain|Cortex|Markers of cortex layers|Germinal brain 1 1 144 254 8
GO:0000506 glycosylphosphatidylinositol-N-acetylglucosaminyltransferase (GPI-GnT) complex GO|GO.CC 1 1 144 8 2
GO:0036066 protein O-linked fucosylation GO|GO.BP 1 1 144 8 2
GO:0006664 glycolipid metabolic process GO|GO.BP 1 1 144 103 5
GO:0042287 MHC protein binding GO|GO.MF 1 1 144 30 3
GO:1903509 liposaccharide metabolic process GO|GO.BP 1 1 144 104 5
GO:0048478 replication fork protection GO|GO.BP 1 1 144 9 2
kable(enrichment$enrichmentTable %>% filter(class==top_modules[3]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR,
                    effectiveClassSize, effectiveSetSize, nCommonGenes),
      caption = paste0('Enriched terms for module ', top_modules[3], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[3]][1],4), ')'))
Enriched terms for module #FE6E8A (MTcor = -0.9469)
dataSetID shortDataSetName inGroups Bonferroni FDR effectiveClassSize effectiveSetSize nCommonGenes
GO:0043062 extracellular structure organization GO|GO.BP 1 1 30 352 5
GO:0000095 S-adenosyl-L-methionine transmembrane transporter activity GO|GO.MF 1 1 30 1 1
GO:0002225 positive regulation of antimicrobial peptide production GO|GO.BP 1 1 30 1 1
GO:0002760 positive regulation of antimicrobial humoral response GO|GO.BP 1 1 30 1 1
GO:0002784 regulation of antimicrobial peptide production GO|GO.BP 1 1 30 1 1
GO:0002786 regulation of antibacterial peptide production GO|GO.BP 1 1 30 1 1
GO:0002803 positive regulation of antibacterial peptide production GO|GO.BP 1 1 30 1 1
GO:0015805 S-adenosyl-L-methionine transport GO|GO.BP 1 1 30 1 1
GO:0047315 kynurenine-glyoxylate transaminase activity GO|GO.MF 1 1 30 1 1
GO:1901846 positive regulation of cell communication by electrical coupling involved in cardiac conduction GO|GO.BP 1 1 30 1 1

Save Enrichment Analysis results

save(enrichment, file='./../Data/enrichmentAnalysis.RData')



Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.10.0                      
##  [2] BrainDiseaseCollection_1.00              
##  [3] anRichment_1.01-2                        
##  [4] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
##  [5] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2  
##  [6] GenomicFeatures_1.38.2                   
##  [7] GenomicRanges_1.38.0                     
##  [8] GenomeInfoDb_1.22.0                      
##  [9] anRichmentMethods_0.90-1                 
## [10] WGCNA_1.68                               
## [11] fastcluster_1.1.25                       
## [12] dynamicTreeCut_1.63-1                    
## [13] GO.db_3.10.0                             
## [14] AnnotationDbi_1.48.0                     
## [15] IRanges_2.20.2                           
## [16] S4Vectors_0.24.3                         
## [17] Biobase_2.46.0                           
## [18] BiocGenerics_0.32.0                      
## [19] biomaRt_2.42.0                           
## [20] knitr_1.24                               
## [21] doParallel_1.0.15                        
## [22] iterators_1.0.12                         
## [23] foreach_1.4.7                            
## [24] polycor_0.7-10                           
## [25] expss_0.10.1                             
## [26] GGally_1.4.0                             
## [27] gridExtra_2.3                            
## [28] viridis_0.5.1                            
## [29] viridisLite_0.3.0                        
## [30] RColorBrewer_1.1-2                       
## [31] dendextend_1.13.3                        
## [32] plotly_4.9.2                             
## [33] glue_1.3.1                               
## [34] reshape2_1.4.3                           
## [35] forcats_0.4.0                            
## [36] stringr_1.4.0                            
## [37] dplyr_0.8.3                              
## [38] purrr_0.3.3                              
## [39] readr_1.3.1                              
## [40] tidyr_1.0.2                              
## [41] tibble_2.1.3                             
## [42] ggplot2_3.2.1                            
## [43] tidyverse_1.3.0                          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.2-0                 BiocFileCache_1.10.2       
##   [5] plyr_1.8.5                  lazyeval_0.2.2             
##   [7] splines_3.6.0               crosstalk_1.0.0            
##   [9] BiocParallel_1.20.1         robust_0.4-18.2            
##  [11] digest_0.6.24               htmltools_0.4.0            
##  [13] fansi_0.4.1                 magrittr_1.5               
##  [15] checkmate_1.9.4             memoise_1.1.0              
##  [17] fit.models_0.5-14           cluster_2.0.8              
##  [19] annotate_1.64.0             Biostrings_2.54.0          
##  [21] modelr_0.1.5                matrixStats_0.55.0         
##  [23] askpass_1.1                 prettyunits_1.0.2          
##  [25] colorspace_1.4-1            blob_1.2.1                 
##  [27] rvest_0.3.5                 rappdirs_0.3.1             
##  [29] rrcov_1.4-7                 haven_2.2.0                
##  [31] xfun_0.8                    crayon_1.3.4               
##  [33] RCurl_1.95-4.12             jsonlite_1.6               
##  [35] genefilter_1.68.0           impute_1.60.0              
##  [37] survival_2.44-1.1           gtable_0.3.0               
##  [39] zlibbioc_1.32.0             XVector_0.26.0             
##  [41] DelayedArray_0.12.2         DEoptimR_1.0-8             
##  [43] scales_1.1.0                mvtnorm_1.0-11             
##  [45] DBI_1.1.0                   Rcpp_1.0.3                 
##  [47] xtable_1.8-4                progress_1.2.2             
##  [49] htmlTable_1.13.1            foreign_0.8-71             
##  [51] bit_1.1-15.2                preprocessCore_1.48.0      
##  [53] Formula_1.2-3               htmlwidgets_1.5.1          
##  [55] httr_1.4.1                  ellipsis_0.3.0             
##  [57] acepack_1.4.1               farver_2.0.3               
##  [59] pkgconfig_2.0.3             reshape_0.8.8              
##  [61] XML_3.99-0.3                nnet_7.3-12                
##  [63] dbplyr_1.4.2                locfit_1.5-9.1             
##  [65] later_1.0.0                 labeling_0.3               
##  [67] tidyselect_0.2.5            rlang_0.4.4                
##  [69] munsell_0.5.0               cellranger_1.1.0           
##  [71] tools_3.6.0                 cli_2.0.1                  
##  [73] generics_0.0.2              RSQLite_2.2.0              
##  [75] broom_0.5.4                 fastmap_1.0.1              
##  [77] evaluate_0.14               yaml_2.2.0                 
##  [79] bit64_0.9-7                 fs_1.3.1                   
##  [81] robustbase_0.93-5           nlme_3.1-139               
##  [83] mime_0.9                    xml2_1.2.2                 
##  [85] compiler_3.6.0              rstudioapi_0.10            
##  [87] curl_4.3                    reprex_0.3.0               
##  [89] geneplotter_1.64.0          pcaPP_1.9-73               
##  [91] stringi_1.4.6               highr_0.8                  
##  [93] lattice_0.20-38             Matrix_1.2-17              
##  [95] vctrs_0.2.2                 pillar_1.4.3               
##  [97] lifecycle_0.1.0             data.table_1.12.8          
##  [99] bitops_1.0-6                httpuv_1.5.2               
## [101] rtracklayer_1.46.0          R6_2.4.1                   
## [103] latticeExtra_0.6-28         promises_1.1.0             
## [105] codetools_0.2-16            MASS_7.3-51.4              
## [107] assertthat_0.2.1            SummarizedExperiment_1.16.1
## [109] DESeq2_1.26.0               openssl_1.4.1              
## [111] withr_2.1.2                 GenomicAlignments_1.22.1   
## [113] Rsamtools_2.2.2             GenomeInfoDbData_1.2.2     
## [115] hms_0.5.3                   grid_3.6.0                 
## [117] rpart_4.1-15                rmarkdown_1.14             
## [119] Cairo_1.5-10                shiny_1.4.0                
## [121] lubridate_1.7.4             base64enc_0.1-3